Data import

## Download the MLMUS "High School and Beyond" Stata file once and cache
## it locally as "hsb.dta12"; read.dta() from foreign reads the Stata
## version-12 format.
library(foreign)
if (!file.exists("hsb.dta12")){
    download.file("http://www.stata-press.com/data/mlmus3/hsb.dta",
                  destfile = "hsb.dta12")
}
hsb <- read.dta("hsb.dta12")

Summarize the imported data

## First 6 rows of the imported data frame
head(hsb)
##   minority female    ses mathach size sector pracad disclim himinty
## 1        0      1 -1.528   5.876  842      0   0.35   1.597       0
## 2        0      1 -0.588  19.708  842      0   0.35   1.597       0
## 3        0      0 -0.528  20.349  842      0   0.35   1.597       0
## 4        0      0 -0.668   8.781  842      0   0.35   1.597       0
## 5        0      0 -0.158  17.898  842      0   0.35   1.597       0
## 6        0      0  0.022   4.583  842      0   0.35   1.597       0
##   schoolid     mean       sd    sdalt     junk   sdalt2 num       se
## 1     1224 9.715446 7.592785 6.256328 45.71077 48.39363  47 1.107522
## 2     1224 9.715446 7.592785 6.256328 49.99941 48.39363  47 1.107522
## 3     1224 9.715446 7.592785 6.256328 59.47536 48.39363  47 1.107522
## 4     1224 9.715446 7.592785 6.256328 14.86853 48.39363  47 1.107522
## 5     1224 9.715446 7.592785 6.256328 27.67840 48.39363  47 1.107522
## 6     1224 9.715446 7.592785 6.256328 64.86649 48.39363  47 1.107522
##       sealt   sealt2       t2    t2alt pickone     mmses     mnses
## 1 0.9125792 1.014718 6.958498 8.289523       1 -0.434383 -0.434383
## 2 0.9125792 1.014718 6.958498 8.289523       0 -0.434383 -0.434383
## 3 0.9125792 1.014718 6.958498 8.289523       0 -0.434383 -0.434383
## 4 0.9125792 1.014718 6.958498 8.289523       0 -0.434383 -0.434383
## 5 0.9125792 1.014718 6.958498 8.289523       0 -0.434383 -0.434383
## 6 0.9125792 1.014718 6.958498 8.289523       0 -0.434383 -0.434383
##         xb     resid
## 1 10.13759 -4.261589
## 2 10.13759  9.570412
## 3 10.13759 10.211412
## 4 10.13759 -1.356588
## 5 10.13759  7.760412
## 6 10.13759 -5.554588
## Descriptive statistics for every numeric column
rockchalk::summarize(hsb)
## Numeric variables
##            minority   female      ses     mathach     size    sector 
## min           0          0        -3.76     -2.83    100         0   
## med           0          1         0        13.13   1016         0   
## max           1          1         2.69     24.99   2713         1   
## mean          0.27       0.53      0        12.75   1056.86      0.49
## sd            0.45       0.50      0.78      6.88    604.17      0.50
## skewness      1.01      -0.11     -0.23     -0.18      0.57      0.03
## kurtosis     -0.98      -1.99     -0.38     -0.92     -0.36     -2   
## nobs       7185       7185      7185      7185      7185      7185   
## nmissing      0          0         0         0         0         0   
##            pracad    disclim   himinty   schoolid     mean      sd   
## min           0        -2.42      0      1224          4.24      3.54
## med           0.53     -0.23      0      5192         13.16      6.30
## max           1         2.76      1      9586         19.72      8.48
## mean          0.53     -0.13      0.28   5277.90      12.75      6.20
## sd            0.25      0.94      0.45   2499.58       3.01      0.86
## skewness      0.16      0.24      0.98      0.11      -0.27     -0.24
## kurtosis     -0.89     -0.16     -1.04     -1.25      -0.05      0.22
## nobs       7185      7185      7185      7185       7185      7185   
## nmissing      0         0         0         0          0         0   
##             sdalt      junk    sdalt2      num       se       sealt 
## min           6.26      0        48.39     14         0.51      0.76
## med           6.26     30.63     48.39     51         0.89      0.88
## max           6.26    239.29     48.39     67         1.82      1.67
## mean          6.26     47.32     48.39     48.02      0.92      0.92
## sd            0        48.90      0        10.82      0.20      0.13
## skewness       NaN      1.30       NaN     -0.58      1.11      1.59
## kurtosis       NaN      1.46       NaN     -0.37      2.32      3.25
## nobs       7185      7185      7185      7185      7185      7185   
## nmissing      0         0         0         0         0         0   
##            sealt2      t2       t2alt    pickone    mmses     mnses 
## min           0.85      0         0         0        -1.19     -1.19
## med           0.97      5.75      4.36      0         0.03      0.03
## max           1.86    195.81     52.82      1         0.82      0.82
## mean          1.03     14.66      8.54      0.02      0         0   
## sd            0.14     26.42     11.06      0.15      0.41      0.41
## skewness      1.59      3.67      2.06      6.47     -0.27     -0.27
## kurtosis      3.25     16.89      4.24     39.92     -0.48     -0.48
## nobs       7185      7185      7185      7185      7185      7185   
## nmissing      0         0         0         0         0         0   
##              xb       resid 
## min           5.68    -19.49
## med          12.87      0.24
## max          17.52     16.44
## mean         12.69      0.06
## sd            2.42      6.46
## skewness     -0.27     -0.14
## kurtosis     -0.48     -0.69
## nobs       7185      7185   
## nmissing      0         0
## Structure of the data: 7185 rows, 26 variables, plus the Stata file
## attributes preserved by read.dta()
str(hsb)
## 'data.frame':    7185 obs. of  26 variables:
##  $ minority: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ female  : int  1 1 0 0 0 0 1 0 1 0 ...
##  $ ses     : num  -1.528 -0.588 -0.528 -0.668 -0.158 ...
##  $ mathach : num  5.88 19.71 20.35 8.78 17.9 ...
##  $ size    : int  842 842 842 842 842 842 842 842 842 842 ...
##  $ sector  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pracad  : num  0.35 0.35 0.35 0.35 0.35 ...
##  $ disclim : num  1.6 1.6 1.6 1.6 1.6 ...
##  $ himinty : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ schoolid: num  1224 1224 1224 1224 1224 ...
##  $ mean    : num  9.72 9.72 9.72 9.72 9.72 ...
##  $ sd      : num  7.59 7.59 7.59 7.59 7.59 ...
##  $ sdalt   : num  6.26 6.26 6.26 6.26 6.26 ...
##  $ junk    : num  45.7 50 59.5 14.9 27.7 ...
##  $ sdalt2  : num  48.4 48.4 48.4 48.4 48.4 ...
##  $ num     : num  47 47 47 47 47 47 47 47 47 47 ...
##  $ se      : num  1.11 1.11 1.11 1.11 1.11 ...
##  $ sealt   : num  0.913 0.913 0.913 0.913 0.913 ...
##  $ sealt2  : num  1.01 1.01 1.01 1.01 1.01 ...
##  $ t2      : num  6.96 6.96 6.96 6.96 6.96 ...
##  $ t2alt   : num  8.29 8.29 8.29 8.29 8.29 ...
##  $ pickone : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ mmses   : num  -0.434 -0.434 -0.434 -0.434 -0.434 ...
##  $ mnses   : num  -0.434 -0.434 -0.434 -0.434 -0.434 ...
##  $ xb      : num  10.1 10.1 10.1 10.1 10.1 ...
##  $ resid   : num  -4.26 9.57 10.21 -1.36 7.76 ...
##  - attr(*, "datalabel")= chr ""
##  - attr(*, "time.stamp")= chr "13 Feb 2017 09:05"
##  - attr(*, "formats")= chr  "%8.0g" "%8.0g" "%9.0g" "%9.0g" ...
##  - attr(*, "types")= int  251 251 254 254 252 251 254 254 251 254 ...
##  - attr(*, "val.labels")= chr  "" "" "" "" ...
##  - attr(*, "var.labels")= chr  "" "" "" "" ...
##  - attr(*, "version")= int 12
## peek() opens a data viewer; guard it so batch rendering skips it
if(interactive()) kutils::peek(hsb)

Create a factor variable for school

I always create a factor when there is danger that a classifier variable is coded as an integer.

## Convert the integer school identifier into a factor so that modeling
## and plotting functions treat it as a classifier, not a number
hsb$schoolidf <- factor(hsb$schoolid)

3.7.1 Random effect model

library(lme4)
## Loading required package: Matrix
## Loading required package: methods
## Random-intercept model: mathach regressed on ses, with a random
## intercept for each school, estimated by maximum likelihood (REML = FALSE)
m1_hs <- lmer(mathach ~ ses + (1 | schoolidf), data = hsb, REML = FALSE)
summary(m1_hs)
## Linear mixed model fit by maximum likelihood  ['lmerMod']
## Formula: mathach ~ ses + (1 | schoolidf)
##    Data: hsb
## 
##      AIC      BIC   logLik deviance df.resid 
##  46649.0  46676.5 -23320.5  46641.0     7181 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.1263 -0.7277  0.0220  0.7578  2.9186 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  schoolidf (Intercept)  4.729   2.175   
##  Residual              37.030   6.085   
## Number of obs: 7185, groups:  schoolidf, 160
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  12.6576     0.1873   67.57
## ses           2.3915     0.1057   22.63
## 
## Correlation of Fixed Effects:
##     (Intr)
## ses 0.003

3.7.3 Create mean of ses and group mean deviations scores.

We are solving part 3 before part 2 because the calculations here are helpful in doing part 2, which is next.

The hsb data has columns “mmses” and “mnses” but we have no documentation on how they were created. I don’t trust them entirely, so let’s create our own group mean and group mean deviation variables.

##' Generate group summaries and individual deviations within groups
##'
##' Similar to Stata egen, except more versatile and fun! Will
##' create a new data frame with 2 columns. Rows match
##' the rows of the original data frame. 
##'
##' Now I return only the new columns
##' @param dframe a data frame
##' @param x Variable names or a vector of variable names
##' @param by A grouping variable name or a vector of grouping names
##' @param FUN Defaults to the mean, have not tested alternatives
##' @param suffix The suffixes to be added to column 1 and column 2
##' @return new data frame with "x_mn" and "x_dev" added as variables
##' @author Paul Johnson
##' @examples
##' Suppose you get the MLMUS hsb data frame, somehow
##' xx1 <- gmd(hsb, "ses", "schoolidf")
##' xx2 <- gmd(hsb, c("ses", "female"), "schoolidf")
##' xx3 <- gmd(hsb, c("ses", "female"), c("schoolidf", "sector"))
##' xx4 <- gmd(hsb, c("ses", "female"),
##'                    c("schoolidf"), FUN = median)
gmd <- function(dframe, x, by, FUN = mean, suffix = c("_mn", "_dev")){
    ## Per-group summary of each x variable (one row per group)
    meanby <- aggregate(dframe[ , x, drop = FALSE],
                       dframe[ , by, drop = FALSE], FUN,
                       na.rm = TRUE)
    ## Spell out 'suffixes' in full; the original relied on partial
    ## matching of 'suffix', which is fragile.
    df2 <- merge(dframe, meanby, by = by, suffixes = c("", suffix[1]),
                 sort = FALSE)
    ## BUG FIX: all.equal() returns TRUE or a *character* description of
    ## the differences; negating a character vector is an error, so the
    ## old `!all.equal(...)` crashed instead of reaching stop(). Wrap in
    ## isTRUE() so a row-order mismatch triggers the intended message.
    if(!isTRUE(all.equal(rownames(df2), rownames(dframe)))){
        MESSG <- "rows were shuffled"
        stop(MESSG)
    }
    ## Deviation of each observation from its own group's summary value
    for(i in x){
        df2[ , paste0(i, suffix[2])] <- df2[ , i] - df2[ , paste0(i, suffix[1])]
    }
    ##gives back just the means and deviations part
    ##df2[ , colnames(df2)[!colnames(df2) %in% colnames(dframe)]]
    ##gives back whole data frame; per-group table kept as an attribute
    attr(df2, "meanby") <- meanby
    df2
}

## Add school means (ses_mn, female_mn) and within-school deviations
## (ses_dev, female_dev) for ses and female
hsb2 <- gmd(hsb, c("ses", "female"), "schoolidf")
head(hsb2[ , c("schoolidf", "ses", "ses_mn", "ses_dev", "female", "female_mn", "female_dev")])
##   schoolidf    ses    ses_mn     ses_dev female female_mn female_dev
## 1      1224 -1.528 -0.434383 -1.09361702      1 0.5957447  0.4042553
## 2      1224 -0.588 -0.434383 -0.15361702      1 0.5957447  0.4042553
## 3      1224 -0.528 -0.434383 -0.09361702      0 0.5957447 -0.5957447
## 4      1224 -0.668 -0.434383 -0.23361701      0 0.5957447 -0.5957447
## 5      1224 -0.158 -0.434383  0.27638297      0 0.5957447 -0.5957447
## 6      1224  0.022 -0.434383  0.45638298      0 0.5957447 -0.5957447

Summary information for school mean SES

## Distribution of school-mean SES (one value repeated for each student)
rockchalk::summarize(hsb2$ses_mn, digits=6)
## Numeric variables
##            hsb2$ses_mn
## min          -1.193946
## med           0.032000
## max           0.824982
## mean          0.000143
## sd            0.413543
## skewness     -0.268465
## kurtosis     -0.478910
## nobs       7185       
## nmissing      0

2 Summarize between school and within school SES diversity

The Stata function xtsum produces a table. We can gather the same information with a few base R calculations.

## overall is individual level information
(sesmean <- mean(hsb[ , "ses"], na.rm = TRUE))
## [1] 0.000143354
(sessd <- sd(hsb[ , "ses"], na.rm = TRUE))
## [1] 0.7793552
(sesrange <- range(hsb[ , "ses"], na.rm = TRUE))
## [1] -3.758  2.692
## between is diversity among school means
sesbyschool <- attr(hsb2, "meanby")[ , "ses", drop=FALSE]
## sd of school means is called 'between' sd
(sessdbtschool <- sd(sesbyschool[ , "ses"]))
## [1] 0.4139706
(sesrangebtschool <- range(sesbyschool[ , "ses"]))
## [1] -1.1939460  0.8249825
## 'within' is the name for the variability of ses_dev, the deviations
## of students from their own school mean
sd(hsb2[ , "ses_dev"])
## [1] 0.660588
range(hsb2[ , "ses_dev"])
## [1] -3.650741  2.856078

Stata output has value T-bar, which is the mean of sample sizes within schools.

cat("The number of schools is \n")
## The number of schools is
(J <- NROW(sesbyschool))
## [1] 160
cat("The average number of students per school is \n")
## The average number of students per school is
## Count rows (students) per school, then average the counts; this is
## Stata xtsum's "T-bar"
Nj <- aggregate(hsb2[ , "ses", drop=FALSE], hsb2[ , "schoolidf", drop = FALSE], NROW)
mean(Nj[ , "ses"], na.rm = TRUE)
## [1] 44.90625

Linear Mixed Models

The model that uses ses without group centring was already estimated, m1_hs.

As luck would have it, it is necessary to rerun the same model using the hsb2 data set in order to conduct the hypothesis tests. If we don’t do this, the anova test will fail, saying “Error in anova.merMod(m1_hs, m2_hs) : all models must be fit to the same data object”.

library(lme4)
## Refit m1_hs on hsb2 (estimates are identical) so that anova() below
## can compare it with m2_hs, which requires both fits to use the same
## data object
m1_hs <- lmer(mathach ~ ses + (1 | schoolidf), data = hsb2, REML = FALSE)
summary(m1_hs)
## Linear mixed model fit by maximum likelihood  ['lmerMod']
## Formula: mathach ~ ses + (1 | schoolidf)
##    Data: hsb2
## 
##      AIC      BIC   logLik deviance df.resid 
##  46649.0  46676.5 -23320.5  46641.0     7181 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.1263 -0.7277  0.0220  0.7578  2.9186 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  schoolidf (Intercept)  4.729   2.175   
##  Residual              37.030   6.085   
## Number of obs: 7185, groups:  schoolidf, 160
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  12.6576     0.1873   67.57
## ses           2.3915     0.1057   22.63
## 
## Correlation of Fixed Effects:
##     (Intr)
## ses 0.003

The question asks us to find out if the impacts of ses_mn and ses_dev are statistically significantly different from each other.

## Between/within decomposition: separate coefficients for the school
## mean of ses and the within-school deviation.
## NOTE(review): this groups by the integer schoolid rather than the
## factor schoolidf used in m1_hs; the output below shows the same 160
## groups, but schoolidf would be more consistent — confirm before reuse.
m2_hs <- lmer(mathach ~ ses_mn + ses_dev + (1 | schoolid), data = hsb2, REML = FALSE)
summary(m2_hs)
## Linear mixed model fit by maximum likelihood  ['lmerMod']
## Formula: mathach ~ ses_mn + ses_dev + (1 | schoolid)
##    Data: hsb2
## 
##      AIC      BIC   logLik deviance df.resid 
##  46573.8  46608.2 -23281.9  46563.8     7180 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.1675 -0.7258  0.0176  0.7554  2.9446 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  schoolid (Intercept)  2.647   1.627   
##  Residual             37.014   6.084   
## Number of obs: 7185, groups:  schoolid, 160
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  12.6836     0.1484   85.46
## ses_mn        5.8656     0.3594   16.32
## ses_dev       2.1912     0.1087   20.17
## 
## Correlation of Fixed Effects:
##         (Intr) ses_mn
## ses_mn  0.010        
## ses_dev 0.000  0.000

3.7.4 Comparison of the coefficients

We want to do a hypothesis test to find if the coefficients for ses_mn and ses_dev are the same. We can do this various ways.

Lets experiment.

My fancy-t test code (adapted)

I’ll adapt my fancy t-test code from a previous problem (Ex-01.2-anorexia.R). I noticed there were two errors because the methods of extracting information are slightly different in lme4. Basically, a short-cut I took that worked for lm objects was not compatible with a mixed model object. I think the changes here remain backward compatible with ordinary linear models. I may put this in the rockchalk package.

##' T-test for the difference in 2 regression parameters
##'
##' This is the one the students call the "fancy t test".
##'
##' I did this because I have trouble understanding terminology in
##' canned functions in other R packages. It has an additional
##' feature, it can import robust standard errors to conduct the test.
##' 
##' @param parm1 A parameter name, in quotes!
##' @param parm2 Another parameter name, in quotes!
##' @param model A fitted regression model
##' @return A vector with the difference, std. err., t-stat,
##' and p value
##' @author Paul Johnson
##' T-test for the difference in 2 regression parameters
##'
##' The "fancy t test": tests the null hypothesis that two coefficients
##' in a fitted model are equal.
##'
##' An alternative variance-covariance matrix (for example, a robust
##' estimate) can be supplied to conduct the test with it instead of
##' the model's own vcov().
##'
##' @param parm1 A parameter name, in quotes!
##' @param parm2 Another parameter name, in quotes!
##' @param model A fitted regression model
##' @param model.cov Optional variance-covariance matrix; defaults to
##'     vcov(model)
##' @return A formatted character vector with the difference, std. err.,
##'     t-stat, and p value
##' @author Paul Johnson
fancyt <- function(parm1, parm2, model, model.cov = NULL){
    if (is.null(model.cov)) {
        model.cov <- vcov(model)
    }
    ## coef(summary(model)) yields the coefficient table for both lm and
    ## lme4 fits; coef(model) alone does not work for mixed models
    est <- coef(summary(model))[c(parm1, parm2), "Estimate"]
    ## Var(b1 - b2) = Var(b1) + Var(b2) - 2 Cov(b1, b2)
    vardiff <- model.cov[parm1, parm1] + model.cov[parm2, parm2] -
        2 * model.cov[parm1, parm2]
    se_diff <- sqrt(vardiff)
    delta <- unname(est[1] - est[2])
    tstat <- delta / se_diff
    pval <- 2 * pt(abs(tstat), df = df.residual(model), lower.tail = FALSE)
    formatC(c(diff = delta, Std.Err. = se_diff, t = tstat, p = pval))
}

I get the same results as Stata’s lincom function:

## Test H0: coefficient on ses_mn equals coefficient on ses_dev
fancyt("ses_mn", "ses_dev", m2_hs)
##        diff    Std.Err.           t           p 
##     "3.674"    "0.3754"     "9.787" "1.766e-22"

I am not generating a confidence interval for that. You could: 3.674 +/- 1.96 * 0.3754. The multiplier is a rule of thumb, you’d need to figure that out to get your beloved CI.

multcomp t test

The comparison of coefficients can also be done in various R packages. For example, multcomp. The function in multcomp is glht, which stands for “generalized linear hypothesis test”.

library(multcomp)
## Loading required package: mvtnorm
## Loading required package: survival
## Loading required package: TH.data
## Loading required package: MASS
## 
## Attaching package: 'TH.data'
## The following object is masked from 'package:MASS':
## 
##     geyser
## Same hypothesis via glht: the linear restriction ses_mn - ses_dev = 0
m2_hs.mcomp <- glht(m2_hs, linfct = "ses_mn - ses_dev = 0")
summary(m2_hs.mcomp)
## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Fit: lmer(formula = mathach ~ ses_mn + ses_dev + (1 | schoolid), data = hsb2, 
##     REML = FALSE)
## 
## Linear Hypotheses:
##                       Estimate Std. Error z value Pr(>|z|)    
## ses_mn - ses_dev == 0   3.6744     0.3754   9.787   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)

Likelihood ratio test works as well

The first model, the one that includes only ses, is actually nested within the second one. The second one estimates 2 coefficients for ses_mn and ses_dev. If we restrict those coefficients to be identical, it is the same as estimating ses.

## Likelihood ratio test: m1_hs (single ses coefficient) is nested in
## m2_hs (separate ses_mn and ses_dev coefficients)
anova(m1_hs, m2_hs)
## Data: hsb2
## Models:
## m1_hs: mathach ~ ses + (1 | schoolidf)
## m2_hs: mathach ~ ses_mn + ses_dev + (1 | schoolid)
##       Df   AIC   BIC logLik deviance  Chisq Chi Df Pr(>Chisq)    
## m1_hs  4 46649 46677 -23320    46641                             
## m2_hs  5 46574 46608 -23282    46564 77.195      1  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

I’m pretty sure that there is a reasonable argument in favor of this likelihood ratio test, rather than the t-test used by my fancyt or by the glht in multcomp. The reason for this that the distributions and the number of degrees of freedom of individual coefficients are still controversial, but there is much less controversy about the likelihood ratio test here.

Note that the predicted values of the 2 models are not identical. That’s a symptom of the violated assumption:

## If the between and within slopes were equal, the two models' fitted
## values would agree; scatter off the diagonal shows they differ
plot(predict(m1_hs), predict(m2_hs),
     xlab = "ses only", ylab = "ses_mn and ses_dev")

3.7.5 Interpretation

In the class notes, I have an explanation for why the estimates of the effect of ses_mn and ses_dev are important.

Simply put, if the regression model has ses as a predictor, we should obtain an equivalent estimate if we replace it by two mathematically equivalent variables:

\[ mathach_{ji} = \beta_0 + \beta_1 \overline{ses}_{j} + \beta_2 \{ses_{ji} - \overline{ses}_{j}\} + \zeta_j + \epsilon_{ji} \]

\[ ses_{ji} = \overline{ses}_{j} + \{ses_{ji} - \overline{ses}_{j}\} \]

Note that the previous simply adds and subtracts the same quantity on the RHS, \(\overline{ses}_{j}\). It stands to reason, then that the regression has the same meaning if we include \(ses_{ji}\) by itself or if we include \(\overline{ses}_{j}\) and \(\{ses_{ji} - \overline{ses}_{j}\}\) as predictors.

The coefficient on ses_mn is often interpreted as the “between effect,” the impact of the average socio-economic status of the school a student belongs to. On the other hand, ses_dev captures the “within effect”: variation among children inside a school, i.e., ses after it has been “group mean centered.”

ses_mn summarizes the differences among schools in socio economic status.

If the coefficients ses_mn and ses_dev are the same, then we are passing one “specification test” for the multilevel model.

If the coefficients are different, it means the slope of the “within-group” regression line is different from the slope of the “between-group” regression line.

## Pooled scatterplot of math achievement against SES
plot(mathach ~ ses, data = hsb)

## Color points by school. BUG FIX: rainbow(n) expects a single count of
## colors; the old call rainbow(hsb2$schoolid) built a palette sized by
## the first school id and recycled it down the rows, so colors did not
## identify schools. Build one color per school level and index by the
## factor's integer codes instead.
plot(mathach ~ ses, data = hsb2,
     col = rainbow(nlevels(hsb2$schoolidf))[hsb2$schoolidf])

## Pooled OLS slope with a confidence band
lm1 <- lm(mathach ~ ses, data = hsb2)
rockchalk::plotSlopes(lm1, plotx = "ses", interval = "confidence")

## Between-school slope (school-mean SES)
lm2 <- lm(mathach ~ ses_mn + ses_dev, data = hsb2)
rockchalk::plotSlopes(lm2, plotx = "ses_mn", interval = "confidence")

## Within-school slope (deviations). Refitting lm2 is redundant but
## harmless; kept for parity with the original transcript.
lm2 <- lm(mathach ~ ses_mn + ses_dev, data = hsb2)
rockchalk::plotSlopes(lm2, plotx = "ses_dev", interval = "confidence")

Check a fixed effects model

## Fixed-effects ("within") check: a separate intercept and ses slope
## for every school, plotted without a legend (160 groups)
lm3 <- lm(mathach ~ ses*schoolidf, data = hsb2)
rockchalk::plotSlopes(lm3, plotx = "ses", modx = "schoolidf", interval = "confidence", plotLegend=FALSE)

library(lattice)
## Per-school scatterplots with fitted regression lines ("p" = points,
## "r" = regression line), one panel per school
xyplot(mathach ~ ses | schoolidf, data = hsb2, type = c("p", "r"))

## (repeated library(lattice) call is harmless)
library(lattice)
## Same display restricted to subsets of schools so panels stay readable
xyplot(mathach ~ ses | schoolidf, data = hsb2, subset = schoolid < 3000,
       type = c("p", "r"))

xyplot(mathach ~ ses | schoolidf, data = hsb2,
       subset = schoolid > 3000 & schoolid <= 5000, type = c("p", "r"))

xyplot(mathach ~ ses | schoolidf, data = hsb2,
       subset = schoolid > 5000 & schoolid <= 7000, type = c("p", "r"))